import warnings
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
from chart_studio.plotly import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter
# Load a 5k-row sample of the preprocessed DonorsChoose data, peek at it,
# then split the approval label from the feature columns.
data = pd.read_csv('preprocessed_data.csv', nrows=5000)
data.head(2)
data.columns
data['project_is_approved'].value_counts()

# Target vector and feature frame (label column dropped from X).
y = data['project_is_approved'].values
X = data.drop(columns=['project_is_approved'])
X.head(2)
# preprocessed essays
print(X.shape, y.shape)
print("="*100)
# Bag-of-words on the preprocessed essays: uni/bi-grams, terms appearing in
# at least 10 documents, capped at the 7000 most frequent features.
vectorizer = CountVectorizer(min_df=10, ngram_range=(1, 2), max_features=7000)
essay_text = X['preprocessed_essays'].values
vectorizer.fit(essay_text)  # fit has to happen only on train data
X_train_essay_bow = vectorizer.transform(essay_text)
# Keep (vocabulary, dense matrix) pair around for later cluster inspection.
essay_info = (vectorizer.get_feature_names(), X_train_essay_bow.toarray())
f1 = vectorizer.get_feature_names()
print("After vectorization")
print(X_train_essay_bow.shape)
print("="*100)
#project_title
# Bag-of-words on the preprocessed titles (astype('U') forces unicode in
# case the column contains NaN/object entries).
vectorizer1 = CountVectorizer(min_df=10, ngram_range=(1, 2), max_features=5000)
title_text = X['preprocessed_titles'].values.astype('U')
vectorizer1.fit(title_text)
X_train_title_bow = vectorizer1.transform(title_text)
# BUG FIX: feature names must come from the title vectorizer (vectorizer1);
# the original read them from `vectorizer`, which still held the essay
# vocabulary, so f2 contained the wrong feature names.
f2 = vectorizer1.get_feature_names()
print("After vectorization")
print(X_train_title_bow.shape)
print("="*100)
# One-hot-style BOW encoding of the school_state categorical column.
vectorizer = CountVectorizer()
state_values = X['school_state'].values
vectorizer.fit(state_values)  # fit has to happen only on train data
X_train_state = vectorizer.transform(state_values)
f5 = vectorizer.get_feature_names()
print("After vectorizations")
print(X_train_state.shape)
print(f5)
print("="*100)
# Encode the teacher_prefix categorical column the same way.
# NOTE(review): raw prefix columns sometimes contain NaN — presumably the
# preprocessing step already filled those; verify upstream.
X['teacher_prefix'].unique()
vectorizer = CountVectorizer()
prefix_values = X['teacher_prefix'].values
vectorizer.fit(prefix_values)
X_train_teacher = vectorizer.transform(prefix_values)
f6 = vectorizer.get_feature_names()
print("After vectorizations")
print(X_train_teacher.shape)
print(f6)
print("="*100)
# Build an explicit vocabulary for project_grade_category, then binary-encode
# with that fixed vocabulary.
# Ref: https://www.kaggle.com/shashank49/donors-choose-knn#Concatinating-all-features-(TFIDF)
from collections import Counter
grade_counts = Counter()
for grade in X['project_grade_category'].values:
    # Split each grade string into fixed 14-character chunks and tally them.
    # https://www.geeksforgeeks.org/python-string-split/
    grade_counts.update(grade[i:i + 14] for i in range(0, len(grade), 14))
# Vocabulary ordered by ascending chunk frequency.
# dict sort by value python: https://stackoverflow.com/a/613218/4084039
grade_vocab = [chunk for chunk, _ in sorted(grade_counts.items(), key=lambda kv: kv[1])]
vectorizer = CountVectorizer(vocabulary=grade_vocab, lowercase=False, binary=True, max_features=4)
vectorizer.fit(X['project_grade_category'].values)  # fit has to happen only on train data
X_train_grade = vectorizer.transform(X['project_grade_category'].values)
f7 = vectorizer.get_feature_names()
print("After vectorizations")
print(X_train_grade.shape)
print(f7)
# BOW-encode the cleaned project categories.
vectorizer = CountVectorizer()
cat_values = X['clean_categories'].values
vectorizer.fit(cat_values)  # fit has to happen only on train data
X_train_cat = vectorizer.transform(cat_values)
f8 = vectorizer.get_feature_names()
print("After vectorizations")
print(X_train_cat.shape)
print(f8)
print("="*100)
# BOW-encode the cleaned project sub-categories.
vectorizer = CountVectorizer()
subcat_values = X['clean_subcategories'].values
vectorizer.fit(subcat_values)  # fit has to happen only on train data
X_train_subcat = vectorizer.transform(subcat_values)
f9 = vectorizer.get_feature_names()
print("After vectorizations")
print(X_train_subcat.shape)
print(f9)
print("="*100)
from sklearn.preprocessing import Normalizer
# NOTE(review): Normalizer L2-normalizes each ROW; on a single-column matrix
# every nonzero price maps to 1.0 — a column scaler (MinMax/Standard) may have
# been intended. Kept as-is to preserve downstream values — confirm intent.
normalizer1 = Normalizer()
price_col = X['price'].values.reshape(-1, 1)  # sklearn requires a 2D array
normalizer1.fit(price_col)
X_train_price_norm = normalizer1.transform(price_col)
print("After vectorizations")
print(X_train_price_norm.shape)
print("="*100)
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
# FIX: the original fit on reshape(1, -1) (one giant row) but transformed
# reshape(-1, 1) (one column). Normalizer is stateless so the output was
# unchanged, but the mismatch was misleading and inconsistent with the other
# numeric features — use the same column shape for both calls.
quantity_col = X['quantity'].values.reshape(-1, 1)
normalizer.fit(quantity_col)
X_train_quantity_norm = normalizer.transform(quantity_col)
print("After vectorizations")
print(X_train_quantity_norm.shape)
print("="*100)
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
# FIX: same fit/transform shape mismatch as the quantity feature — fit used
# reshape(1, -1) while transform used reshape(-1, 1). Normalizer is stateless
# so values are unchanged; this just makes the shapes consistent.
prev_projects_col = X['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)
normalizer.fit(prev_projects_col)
X_train_projects_norm = normalizer.transform(prev_projects_col)
print("After vectorizations")
print(X_train_projects_norm.shape)
print("="*100)
from sklearn.preprocessing import Normalizer
# Normalize the sentiment score column (shape (n, 1) as sklearn expects).
senti_col = X['sentimental_score'].values.reshape(-1, 1)
normalizer = Normalizer().fit(senti_col)
X_train_senti_norm = normalizer.transform(senti_col)
print("After vectorizations")
print(X_train_senti_norm.shape)
print("="*100)
from sklearn.preprocessing import Normalizer
# Normalize the essay word-count column.
ewc_col = X['preprocessed_essay_word_count'].values.reshape(-1, 1)
normalizer = Normalizer().fit(ewc_col)
X_train_ewc_norm = normalizer.transform(ewc_col)
print("After vectorization")
print(X_train_ewc_norm.shape)
print("="*100)
from sklearn.preprocessing import Normalizer
# Normalize the title word-count column.
twc_col = X['preprocessed_title_word_count'].values.reshape(-1, 1)
normalizer = Normalizer().fit(twc_col)
X_train_twc_norm = normalizer.transform(twc_col)
print("After vectorization")
print(X_train_twc_norm.shape)
print("="*100)
# Stack every encoded feature block side by side into one CSR matrix.
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
from scipy.sparse import hstack
feature_blocks = (
    X_train_essay_bow, X_train_title_bow, X_train_state, X_train_teacher,
    X_train_grade, X_train_cat, X_train_subcat, X_train_price_norm,
    X_train_quantity_norm, X_train_projects_norm, X_train_senti_norm,
    X_train_ewc_norm, X_train_twc_norm,
)
X_stack = hstack(feature_blocks).tocsr()
print("Final Data Matrix")
print(X_stack.shape)
import warnings
warnings.filterwarnings("ignore")
# Keep the 5000 features with the highest ANOVA F-score against the label.
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k=5000)
X_final = selector.fit_transform(X_stack, y)
print("Shape of the data matrix after dim reduction :", X_final.shape)
# Persist the reduced matrix and labels to disk, then reload them
# (round-trips through .npz / .npy so later cells can restart from here).
# https://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format
from scipy import sparse
sparse.save_npz("X_final.npz", X_final)
np.save('y', y)
y = np.load('y.npy')
X_final = sparse.load_npz("X_final.npz")
# Elbow plot: KMeans inertia for several candidate cluster counts.
# NOTE(review): `n_jobs` was removed from KMeans in newer scikit-learn —
# confirm the pinned sklearn version still accepts it.
from sklearn.cluster import KMeans
k = [4, 8, 16, 25, 35]
loss = []
for n_clusters in tqdm(k):
    model = KMeans(n_clusters=n_clusters, n_jobs=-1)
    model.fit(X_final)
    loss.append(model.inertia_)
plt.plot(k, loss)
plt.scatter(k, loss)
plt.xlabel("k")
plt.ylabel("Loss")
plt.title("k vs inertia_")
plt.grid()
plt.show()
# Applying best n_clusters
from sklearn.cluster import KMeans
clf2 = KMeans(n_clusters=15, random_state=0, n_jobs=-1).fit(X_final)
#cluster_dict = getClusterDict(essay_info, clf2.labels_)
#cluster_dict = dict(sorted(list(cluster_dict.items()), key=lambda x: x[0]))
# BUG FIX: the line below referenced `cluster_dict` and `plotWordCloud`,
# which are only produced by the commented-out helpers above, so it raised
# NameError at runtime. Disabled until those helpers are restored.
#for key, val in cluster_dict.items(): plotWordCloud(val, key, 'KMeans')
# Map each row index / essay to its KMeans cluster label.
# Ref: https://stackoverflow.com/questions/36195457/python-sklearn-kmeans-how-to-get-the-samples-points-in-each-clusters
cluster_map = pd.DataFrame({
    'data_index': X.index.values,
    'essay': X['preprocessed_essays'].values,
    'cluster': clf2.labels_,
})
cluster_map.head(5)
# Word cloud + word-frequency summary for each of the 15 KMeans clusters.
from wordcloud import WordCloud
from collections import Counter
for i in range(0, 15):
    dfi = cluster_map[cluster_map.cluster == i]
    print("Essay Wordcloud for cluster {} :".format(i))
    # FIX: gather tokens once and join — the original built the string with
    # quadratic `words += t + ' '` concatenation.
    tokens = [t for row in dfi['essay'].values for t in row.split()]
    words = ' '.join(tokens)
    wordcloud = WordCloud(width = 800, height = 800, background_color ='white', min_font_size = 10).generate(words)
    # plot the WordCloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
    # https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
    word_count = len(tokens)
    # FIX: the original rebound the name `Counter` to a Counter INSTANCE each
    # iteration and only kept working because the per-iteration import
    # restored the class — use a distinct variable name instead.
    word_counter = Counter(tokens)
    most_occur = word_counter.most_common(10)
    print("Summary of words in Cluster {} :".format(i))
    print("Number of words in the cluster: {}".format(word_count))
    print("Most frequent words in the cluster: {}".format(most_occur))
    print('\n\n')
# Hierarchical (Ward) clustering on the reduced matrix.
# Ref: https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/
import scipy.cluster.hierarchy as shc
X_agg = X_final.todense()  # linkage() rejects sparse input, so densify
plt.figure(figsize=(10, 7))
plt.title("Dendogram")
dend = shc.dendrogram(shc.linkage(X_agg, method='ward'))
# Cache the dense matrix on disk and reload it for the clustering steps below.
np.save('dense', X_agg)
X_new = np.load('dense.npy')
from sklearn.cluster import AgglomerativeClustering
clf3 = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='ward')
clf3.fit_predict(X_new)
# Map each row index / essay to its agglomerative cluster label.
# Ref: https://stackoverflow.com/questions/36195457/python-sklearn-kmeans-how-to-get-the-samples-points-in-each-clusters
cluster_map = pd.DataFrame({
    'data_index': X.index.values,
    'essay': X['preprocessed_essays'].values,
    'cluster': clf3.labels_,
})
cluster_map.head(5)
# Word cloud + word-frequency summary for each of the 4 agglomerative clusters.
from wordcloud import WordCloud
from collections import Counter
for i in range(0, 4):
    dfi = cluster_map[cluster_map.cluster == i]
    print("Essay Wordcloud for cluster {} :".format(i))
    # FIX: gather tokens once and join — the original built the string with
    # quadratic `words += t + ' '` concatenation.
    tokens = [t for row in dfi['essay'].values for t in row.split()]
    words = ' '.join(tokens)
    wordcloud = WordCloud(width = 800, height = 800, background_color ='white', min_font_size = 10).generate(words)
    # plot the WordCloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
    # https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
    word_count = len(tokens)
    # FIX: avoid shadowing the Counter class with an instance (the original
    # only worked because the import inside the loop re-bound the name).
    word_counter = Counter(tokens)
    most_occur = word_counter.most_common(10)
    print("Summary of words in Cluster {} :".format(i))
    print("Number of words in the cluster: {}".format(word_count))
    print("Most frequent words in the cluster: {}".format(most_occur))
    print('\n\n')
import math
print(math.log(5000))
# Elbow-knee plot of sorted 8-NN distances to pick DBSCAN's eps.
# Ref: https://datascience.stackexchange.com/questions/10162/knn-distance-plot-for-determining-eps-of-dbscan
# https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=8).fit(X_new)
distances, indices = knn.kneighbors(X_new)
# Sort each distance column, then plot the 2nd-nearest-neighbor distances.
distances = np.sort(distances, axis=0)[:, 1]
plt.plot(distances)
plt.title("Plot to find best eps using elbow-knee method")
plt.xlabel('Point index')
plt.ylabel('eps-values')
from sklearn.cluster import DBSCAN
# Fit DBSCAN with eps read off the knee plot above, then map essays to labels.
clf4 = DBSCAN(eps=15, min_samples=8).fit(X_new)
# Ref: https://stackoverflow.com/questions/36195457/python-sklearn-kmeans-how-to-get-the-samples-points-in-each-clusters
cluster_map = pd.DataFrame({
    'data_index': X.index.values,
    'essay': X['preprocessed_essays'].values,
    'cluster': clf4.labels_,
})
cluster_map.head(5)
cluster_map['cluster'].unique()
# Word cloud + word-frequency summary for DBSCAN cluster 0 (noise is -1).
from wordcloud import WordCloud
from collections import Counter
for i in range(0, 1):
    dfi = cluster_map[cluster_map.cluster == i]
    print("Essay Wordcloud for cluster {} :".format(i))
    # FIX: gather tokens once and join — the original built the string with
    # quadratic `words += t + ' '` concatenation.
    tokens = [t for row in dfi['essay'].values for t in row.split()]
    words = ' '.join(tokens)
    wordcloud = WordCloud(width = 800, height = 800, background_color ='white', min_font_size = 10).generate(words)
    # plot the WordCloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
    # https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
    word_count = len(tokens)
    # FIX: avoid shadowing the Counter class with an instance (the original
    # only worked because the import inside the loop re-bound the name).
    word_counter = Counter(tokens)
    most_occur = word_counter.most_common(10)
    print("Summary of words in Cluster {} :".format(i))
    print("Number of words in the cluster: {}".format(word_count))
    print("Most frequent words in the cluster: {}".format(most_occur))
    print('\n\n')